{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['austen-emma.txt',\n",
       " 'austen-persuasion.txt',\n",
       " 'austen-sense.txt',\n",
       " 'bible-kjv.txt',\n",
       " 'blake-poems.txt',\n",
       " 'bryant-stories.txt',\n",
       " 'burgess-busterbrown.txt',\n",
       " 'carroll-alice.txt',\n",
       " 'chesterton-ball.txt',\n",
       " 'chesterton-brown.txt',\n",
       " 'chesterton-thursday.txt',\n",
       " 'edgeworth-parents.txt',\n",
       " 'melville-moby_dick.txt',\n",
       " 'milton-paradise.txt',\n",
       " 'shakespeare-caesar.txt',\n",
       " 'shakespeare-hamlet.txt',\n",
       " 'shakespeare-macbeth.txt',\n",
       " 'whitman-leaves.txt']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from gensim.models import word2vec\n",
    "from nltk.corpus import gutenberg\n",
    "# List the file ids of the texts available through NLTK's gutenberg corpus reader.\n",
    "gutenberg.fileids()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1010654"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import logging\n",
    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n",
    "bible_kjv_words = gutenberg.words('bible-kjv.txt')\n",
    "bible_kjv_sents = gutenberg.sents('bible-kjv.txt')  \n",
    "len(bible_kjv_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from string import punctuation\n",
    "# Display the punctuation characters from the string module; they are used below to filter tokens.\n",
    "punctuation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "discard_punctuation_and_lowercased_sents = [[word.lower() for word in sent if word not in punctuation] for sent in bible_kjv_sents]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['[', 'The', 'King', 'James', 'Bible', ']']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# First sentence as returned by the corpus reader, before any cleaning.\n",
    "bible_kjv_sents[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['the', 'king', 'james', 'bible']"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The same sentence after cleaning: punctuation tokens removed and words lowercased.\n",
    "discard_punctuation_and_lowercased_sents[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2017-07-24 18:45:14,744 : INFO : collecting all words and their counts\n",
      "2017-07-24 18:45:14,745 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n",
      "2017-07-24 18:45:14,883 : INFO : PROGRESS: at sentence #10000, processed 315257 words, keeping 7115 word types\n",
      "2017-07-24 18:45:14,978 : INFO : PROGRESS: at sentence #20000, processed 572559 words, keeping 10329 word types\n",
      "2017-07-24 18:45:15,076 : INFO : PROGRESS: at sentence #30000, processed 851171 words, keeping 12741 word types\n",
      "2017-07-24 18:45:15,078 : INFO : collected 12755 word types from a corpus of 854254 raw words and 30103 sentences\n",
      "2017-07-24 18:45:15,079 : INFO : Loading a fresh vocabulary\n",
      "2017-07-24 18:45:15,130 : INFO : min_count=5 retains 5429 unique words (42% of original 12755, drops 7326)\n",
      "2017-07-24 18:45:15,130 : INFO : min_count=5 leaves 841347 word corpus (98% of original 854254, drops 12907)\n",
      "2017-07-24 18:45:15,183 : INFO : deleting the raw counts dictionary of 12755 items\n",
      "2017-07-24 18:45:15,185 : INFO : sample=0.001 downsamples 62 most-common words\n",
      "2017-07-24 18:45:15,187 : INFO : downsampling leaves estimated 583835 word corpus (69.4% of prior 841347)\n",
      "2017-07-24 18:45:15,188 : INFO : estimated required memory for 5429 words and 200 dimensions: 11400900 bytes\n",
      "2017-07-24 18:45:15,260 : INFO : resetting layer weights\n",
      "2017-07-24 18:45:15,393 : INFO : training model with 3 workers on 5429 vocabulary and 200 features, using sg=0 hs=0 sample=0.001 negative=5 window=5\n",
      "2017-07-24 18:45:16,444 : INFO : PROGRESS: at 21.83% examples, 614312 words/s, in_qsize 5, out_qsize 0\n",
      "2017-07-24 18:45:17,452 : INFO : PROGRESS: at 39.29% examples, 558581 words/s, in_qsize 5, out_qsize 0\n",
      "2017-07-24 18:45:18,459 : INFO : PROGRESS: at 53.89% examples, 516221 words/s, in_qsize 6, out_qsize 1\n",
      "2017-07-24 18:45:19,462 : INFO : PROGRESS: at 73.22% examples, 526311 words/s, in_qsize 6, out_qsize 0\n",
      "2017-07-24 18:45:20,463 : INFO : PROGRESS: at 88.66% examples, 514927 words/s, in_qsize 6, out_qsize 0\n",
      "2017-07-24 18:45:20,915 : INFO : worker thread finished; awaiting finish of 2 more threads\n",
      "2017-07-24 18:45:20,919 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
      "2017-07-24 18:45:20,935 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
      "2017-07-24 18:45:20,937 : INFO : training on 4271270 raw words (2917995 effective words) took 5.5s, 527138 effective words/s\n"
     ]
    }
   ],
   "source": [
    "bible_kjv_word2vec_model = word2vec.Word2Vec(discard_punctuation_and_lowercased_sents, min_count=5, size=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2017-07-24 18:46:38,749 : INFO : saving Word2Vec object under bible_word2vec_gensim, separately None\n",
      "2017-07-24 18:46:38,751 : INFO : not storing attribute syn0norm\n",
      "2017-07-24 18:46:38,752 : INFO : not storing attribute cum_table\n",
      "2017-07-24 18:46:38,939 : INFO : saved bible_word2vec_gensim\n"
     ]
    }
   ],
   "source": [
    "# Save the trained model to a file named bible_word2vec_gensim (a relative path).\n",
    "bible_kjv_word2vec_model.save(\"bible_word2vec_gensim\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2017-07-24 18:46:39,504 : INFO : precomputing L2-norms of word weight vectors\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[('truth', 0.8255427479743958),\n",
       " ('salvation', 0.7879523038864136),\n",
       " ('spirit', 0.7764705419540405),\n",
       " ('christ', 0.7500270009040833),\n",
       " ('lord', 0.7459903955459595),\n",
       " ('righteousness', 0.7290480136871338),\n",
       " ('hosts', 0.7282350659370422),\n",
       " ('grace', 0.7198275923728943),\n",
       " ('mercy', 0.7179147005081177),\n",
       " ('glory', 0.716149628162384),\n",
       " ('faith', 0.6987133026123047),\n",
       " ('gospel', 0.6868479251861572),\n",
       " ('wisdom', 0.6749225854873657),\n",
       " ('judgment', 0.6667770743370056),\n",
       " ('hope', 0.6638231873512268),\n",
       " ('fear', 0.6611486673355103),\n",
       " ('thus', 0.6549487709999084),\n",
       " ('prayer', 0.6460980772972107),\n",
       " ('word', 0.6433387398719788),\n",
       " ('kingdom', 0.6429972648620605),\n",
       " ('power', 0.6319444179534912),\n",
       " ('strength', 0.6196019053459167),\n",
       " ('voice', 0.6156880259513855),\n",
       " ('servant', 0.6125010848045349),\n",
       " ('commandment', 0.6105615496635437),\n",
       " ('holy', 0.6094077825546265),\n",
       " ('law', 0.6089072227478027),\n",
       " ('name', 0.6086355447769165),\n",
       " ('who', 0.608123242855072),\n",
       " ('covenant', 0.6066034436225891)]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bible_kjv_word2vec_model.most_similar([\"god\"], topn=30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
